Dataset_Comparison_CCLE_NCI

2020-11-16

Setup

## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.4.3
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
## Warning: package 'circlize' was built under R version 4.0.3
## ========================================
## circlize version 0.4.11
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
## 
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
##   in R. Bioinformatics 2014.
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(circlize))
## ========================================

Overall Matrix agreement subsetted for cell lines ang genes in common and order.

RPPA? Plot in double matrix of transcripromis vs proteomics correlations ordered by pathway?

Part of of AA - > superimpose subpathways IDEAS - lineage per heatmap - RNAseq quartile vs protein levels mapping -

# load(file = "./../Project_Datasets/CCLE_OMICS.rda")
# load(file = "./../Project_Datasets/NCI60_OMICS.rda")
# load(file = "./../Project_Datasets/Metabolic_Pathway_annotations.RData")
# 
# HUMAN_9606_idmapping <- read_tsv("./../Project_Datasets/HUMAN_9606_idmapping.dat",
#                                  col_names = FALSE) %>%
#     setNames(c("Uniprot", "Type", "ID"))

load(file = "./Project_Datasets/CCLE_OMICS.rda")
load(file = "./Project_Datasets/NCI60_OMICS.rda")
load(file = "./Project_Datasets/Metabolic_Pathway_annotations.RData")
#Derms <- openxlsx::read.xlsx("./../Project_Datasets/Lineage_tissue_types.xlsx", cols = c(1,6)) %>% na.omit()
Derms <- openxlsx::read.xlsx("./Project_Datasets/Lineage_tissue_types.xlsx", cols = c(1,6)) %>% na.omit()

## the file humankegg_df if currently used for subpathways instead of this
#Ordering SMPDB by pathway count and removing duplicates. 
#This allows proteins to be in the most abundance pathway that they are in
# SMPDB <- SMPDB_pathways %>% group_by(Pathway) %>%
#   summarise(count=n()) %>% 
#   right_join(SMPDB_pathways, by = 'Pathway') %>% 
#   arrange(-count) %>% 
#   subset(!duplicated(Gene_Name)) %>% .[,-c(2:4,6:9,11)] %>%
#   pivot_longer(-Pathway, names_to = "Type", values_to = "ID") %>%
#   setNames(c("Subpathway", "Type","ID"))


#Mapping of Uniprots to all others sort of identifiers
HUMAN_9606_idmapping <- read_tsv("./Project_Datasets/HUMAN_9606_idmapping.dat",
                                 col_names = FALSE) %>%
    setNames(c("Uniprot", "Type", "ID"))

#subset hsa KEGG to Gene name
HUMAN_9606_idmapping_hsa <- HUMAN_9606_idmapping[str_detect(HUMAN_9606_idmapping$ID, "hsa:"),-2] %>% 
    left_join(HUMAN_9606_idmapping[HUMAN_9606_idmapping$Type == "Gene_Name", -2], by = "Uniprot") %>% 
    na.omit() %>% .[,-1] %>% setNames(c("KEGG_ID", "Gene.names"))

#pairwise correlation function used in main function
Correlation_pairwise <- function(x,y){
  cor(x,y,use = "pairwise", method = "pearson")
}

# Get lower triangle of the correlation matrix
  get_lower_tri<-function(cormat){
    cormat[lower.tri(cormat)] <- NA
    return(cormat)
  }
# Get upper triangle of the correlation matrix
  get_upper_tri<-function(cormat){
    cormat[upper.tri(cormat)] <- NA
    return(cormat)
  }
  
#compare 2 datasets with common Feature (rows) and Sample names (Columns)
### Change rerun value
Dataset_correlation <- function(Dataset_1,Dataset_2){
  # type = c("Transcriptomics",
  #                                                             "Proteomic",
  #                                                             "Metabolomics")
  #  type <- "Transcriptomics"
  # Type <- case_when(
  #     type == "Transcriptomics" ~ "Gene_name",
  #     type == "Proteomic" ~ "Uniprot",
  #     type == "Metabolomics" ~ "Metabolites",
  #     TRUE ~ as.character(type))

    #Dataset_1 = RNA_seq
    #Dataset_2 = NCI_60_RNA
    #Dataset_1 = CCLE_proteins
    #Dataset_2 = NCI_60_proteins
    #Dataset_1 = CCLE_Met_Common 
    #Dataset_2 = NCI_60_metabolites_common
    Comparison_type <- paste(deparse(substitute(Dataset_1)),deparse(substitute(Dataset_2)))
    Dataset1 <- as.data.frame(Dataset_1)
    Dataset2 <- as.data.frame(Dataset_2)
    Common_Cell_lines <- intersect(colnames(Dataset1),colnames(Dataset2)) #Common Samples
    Common_proteins <- intersect(rownames(Dataset1),rownames(Dataset2)) #Common Features
    
    Master_pathways <- Master_pathways[!duplicated(Master_pathways$ID),]
    ##subsetting for specific master pathways
    # Master_pathways <- Master_pathways %>% 
    #   subset(Pathway %in% c("Amino acid","Nucleotide","TCA cycle"))
    
    Master_pathways_ordered <- Master_pathways[Master_pathways$ID %in% Common_proteins,]  %>%
      left_join(humankegg_df, by = c("ID" = "Uniprot"))%>% .[order(.$Pathway, .$SubPathway),] %>%
      subset(!duplicated(ID))
      #left_join(Gene_pathways[,c("Uniprot","SubPathway")], by = c("ID" = "Uniprot")) %>% subset(!duplicated(ID))
    Master_pathways_ordered[is.na(Master_pathways_ordered)] <- "Not Known"
    
      
    
   #test that Dataset1[1,1] =! Dataset2 [1,1]
    Subset_and_matrix <- function(x){
      #x <- Dataset1
      #subsetting and ordering the Datasets
      x_sub <- x %>% .[rownames(.) %in% Common_proteins, colnames(.) %in% Common_Cell_lines]%>% 
        .[match(Common_proteins,rownames(.)),match(Common_Cell_lines,colnames(.))]
      #print(deparse(substitute(x)))
      cormat <- round(cor(t(x_sub[rownames(x_sub) %in% Master_pathways_ordered$ID,])),2) %>% 
      .[match(Master_pathways_ordered$ID,row.names(.)),match(Master_pathways_ordered$ID,row.names(.))] %>%
        {if(x[1,1] == Dataset_1[1,1]) get_lower_tri(.) else get_upper_tri(.)}
      cormat[is.na(cormat)] <- 0  
      list(matrix = cormat,
           df = x_sub)
      #cormat
    }
    list1 <- Subset_and_matrix(Dataset_1)
    list2 <- Subset_and_matrix(Dataset_2)
    Dataset1_sub <- list1$df %>% as.data.frame()
    Dataset2_sub <- list2$df %>% as.data.frame()
        #Combined_df <- rbind(cormat1,cormat2)  
    # #Combined_df$value[Combined_df$Var1 == Combined_df$Var2] <- NA
    # 
    # Feature_order <- Combined_df[!duplicated(Combined_df$Var1),] %>% 
    #   .[order(.$Pathway),"Var1"]
    # 
    # ### ComplexHeatMAP
    # test1 <- Master_pathways_ordered[order(Master_pathways_orderezd$Uniprot),]
    Metabolic_matrix <- as.data.frame(list1$matrix+list2$matrix) %>% as.matrix()
      
    
    set.seed(1587) # to set random generator seed
    cl <- colors(distinct = TRUE)
  
    
    Master_Pathway_colours <- sample(cl, length(unique(Master_pathways_ordered$Pathway))) %>% setNames(unique(Master_pathways_ordered$Pathway))
    Sub_Pathway_colours <- sample(cl, length(unique(Master_pathways_ordered$SubPathway))) %>% setNames(unique(Master_pathways_ordered$SubPathway))
    Metabolic_matrix[Metabolic_matrix ==2] <- 1
    ha = HeatmapAnnotation(Master_path = Master_pathways_ordered$Pathway,
                           Sub_path = Master_pathways_ordered$SubPathway,
                           col = list(Master_path = Master_Pathway_colours,
                                      Sub_path = Sub_Pathway_colours))
    side = rowAnnotation(Master_path = Master_pathways_ordered$Pathway,
                           Sub_path = Master_pathways_ordered$SubPathway,
                           col = list(Master_path = Master_Pathway_colours,
                                      Sub_path = Sub_Pathway_colours))
    
    hmap <- Heatmap(Metabolic_matrix, name = Comparison_type, 
            top_annotation = ha ,
            left_annotation = side,
            cluster_rows = F, cluster_columns = F, show_column_names  = F, show_row_names = F, 
            row_title = paste0("top = ", deparse(substitute(Dataset_1))), 
            column_title = paste0("bottom = ", deparse(substitute(Dataset_2))),
            column_title_side = "bottom")
    

    # ### GGPLOT
    # Combined_df %>%
    #     subset(Var1 %in% Master_pathways$Uniprot & Var2 %in% Master_pathways$Uniprot) %>% na.omit() %>%
    #     ggplot(aes(factor(Var2, levels = Feature_order), factor(Var1,levels = Feature_order), fill = value))+
    #     geom_tile(color = "white")+
    #     scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "black",
    #                          midpoint = 0, limit = c(-1,1), space = "Lab", 
    #                          name="Pearson\nCorrelation") +
    #     theme_minimal()+ 
    #     theme(axis.text.x = element_blank(), axis.text.y = element_blank())+
    #     coord_fixed() + labs(x = deparse(substitute(Dataset_1)),deparse(substitute(Dataset_2)))#+
    # facet_grid(Pathway~., scales = "free_x")
    
    
    #Correlating corresponding Samples Correlation of only paired values which allows for NA values
    Cell_line_correlation <- map2_dbl(
        Dataset1_sub, 
        Dataset2_sub, 
        Correlation_pairwise) 
    
    Cell_line_perm <- purrr::rerun(10, 
                                   map2_dbl(
                                       Dataset1_sub[,sample(ncol(Dataset1_sub))],#Shuffle column-wise and performing sample correlation as above
                                       Dataset2_sub,
                                       Correlation_pairwise)) %>% unlist()
    
    gene_correlation <- pmap(
        list(pmap(Dataset1_sub, c, use.names = T), # unlisting feature rows into vectors into lists and correlating
             pmap(Dataset2_sub, c, use.names = T)),
        Correlation_pairwise) %>% unlist() %>% setNames(Common_proteins) 
    
    gene_correlation_perm <- purrr::rerun(10, #permutation by row shuflfing and correlation
                                          pmap(list(pmap(Dataset1_sub[sample(nrow(Dataset1_sub)),], 
                                                         c, use.names = T), #Shuffle row-wise
                                                    pmap(Dataset2_sub, c, use.names = T)),
                                               Correlation_pairwise) %>% unlist()) %>% unlist()
    
    # Imputing both datasets with Media to derive overall correlation
    Dataset1_sub_imp <- naniar::impute_median_all(Dataset1_sub)
    Dataset2_sub_imp <- naniar::impute_median_all(Dataset2_sub)
    
    Matri_Corr <- MatrixCorrelation::r1(Dataset1_sub_imp,Dataset2_sub_imp) # whole matrix correlation
    
    Matrix_perm <- purrr::rerun(10, #whole matrix permutation of Dataset 1 col-wise shuffle and Dataset 2 row-wise shuffle and correlation
                            MatrixCorrelation::r1(Dataset1_sub_imp[,sample(ncol(Dataset1_sub_imp))],
                                                  Dataset2_sub_imp[sample(nrow(Dataset2_sub_imp)),])) %>% 
        unlist() # permutation
    
    Hist_plot <- function(Data,Perm){
      #Data = Cell_line_correlation
      #Perm = Cell_line_perm
        Sample_Corr <- data.frame(Samples = c(Data,unlist(Perm)),
                              Type = rep(c("Data","Perm"), times = c(length(Data),length(unlist(Perm)))))
        p <- ggplot2::ggplot(Sample_Corr,aes(Samples)) +
    geom_histogram(aes(y=..density..,fill = Type ),      # Histogram with density instead of count on y-axis
                   binwidth=.01,alpha=0.1) +
    geom_density(alpha=.5, aes(colour = Type))+
            ggtitle(paste(Comparison_type,deparse(substitute(Data))))
    }
    
    Feature_quartiles <- rowMeans(Dataset2_sub, na.rm = T) %>% data.frame(Feature = names(.),
                                                           Mean_Abundance = .) %>%
        mutate(quartile = as.factor(ntile(Mean_Abundance, 4))) %>% left_join(stack(gene_correlation), by = c("Feature" = "ind"))
    
    Feature_expression_cor_plot <- ggplot(Feature_quartiles, aes(values,after_stat(density), colour =quartile)) +
        geom_freqpoly(binwidth = 0.1) +
        ggtitle(paste("Feature Expression",Comparison_type))
    
    Sample_lineage <-  Cell_line_correlation %>% data.frame(Sample = names(.), Correlation = .) %>%
      left_join(sample_info[,c("stripped_cell_line_name", "lineage")], by = c("Sample" = "stripped_cell_line_name")) %>%
      left_join(Derms, by = c("lineage" = "Cancer_Lineage" ))
    
    Sample_lineage_cor_plot <- ggplot(Sample_lineage, aes(Correlation,after_stat(density), colour = Derm)) +
        geom_freqpoly(binwidth = 0.01) +
        ggtitle(paste("Sample Lineage",Comparison_type))
    
    
    list(Sample_Corr = list(Sample_lineage = Sample_lineage, 
                            Cell_line_perm = Cell_line_perm, 
                            Histogram = Hist_plot(Cell_line_correlation,Cell_line_perm),
                            Sample_lin_cor_plot = Sample_lineage_cor_plot),
         Feature_Corr = list(Features_quart = Feature_quartiles, 
                             Gene_corr_perm = gene_correlation_perm, 
                             Histogram = Hist_plot(gene_correlation, gene_correlation_perm), 
                             Feature_cor_plot = Feature_expression_cor_plot),
         Matrix_Corr= list(Matri_Corr = Matri_Corr, 
                           Matrix_perm = Matrix_perm,
                           Histogram =   Hist_plot(Matri_Corr,Matrix_perm)),
         Heatmap= list(Dataset1_matri = list1$matrix,
                       Dataset2_matri = list2$matrix,
                       Plot_Heatmap = hmap))
    
}

### Metabolic Comparison ###
#Metabolite_mapping <- read.csv("./../Project_Datasets/CCLE_metabolites_mapped.csv")

Metabolite_mapping <- read.csv("./Project_Datasets/CCLE_metabolites_mapped.csv")
NCI_60_metabolites <-  NCI_60_metabolites %>% separate_rows(Annotation.ID, sep = " ; ") %>%
  subset(str_detect(Annotation.ID, "^H|C") & !duplicated(Annotation.ID)) %>%
  mutate(Annotation.ID =  str_replace_all(Annotation.ID,"HMDB", "HMDB00")) 

Common_Metabolites <- Metabolite_mapping[(Metabolite_mapping$KEGG %in% (NCI_60_metabolites$Annotation.ID)) |
                                          (Metabolite_mapping$HMDB %in% (NCI_60_metabolites$Annotation.ID)) ,]

NCI_60_metabolites_common <- NCI_60_metabolites[str_detect(NCI_60_metabolites$Annotation.ID, 
                                                           paste(c(Common_Metabolites$HMDB,Common_Metabolites$KEGG), collapse = "|")),]

tt1=NCI_60_metabolites_common$Annotation.ID[str_detect(NCI_60_metabolites_common$Annotation.ID,"^C")]
NCI_60_metabolites_common$Annotation.ID[str_detect(NCI_60_metabolites_common$Annotation.ID,"^C")] <- Metabolite_mapping[Metabolite_mapping$KEGG %in% tt1,] %>%  .[match(tt1, .$KEGG),] %>% pull(HMDB)

NCI_60_metabolites_common <- NCI_60_metabolites_common%>% 
  subset(!duplicated(Annotation.ID)) %>%
  column_to_rownames("Annotation.ID")

CCLE_Met_Common <- Metabolites[rownames(Metabolites) %in% Common_Metabolites$Query,]


rownames(CCLE_Met_Common) <- Common_Metabolites$HMDB[match(rownames(CCLE_Met_Common),Common_Metabolites$Query)]

omics <- list(Transcriptome = Dataset_correlation(RNA_seq,NCI_60_RNA),
              Proteome = Dataset_correlation(CCLE_proteins,NCI_60_proteins),
              Metabolome = Dataset_correlation(CCLE_Met_Common,NCI_60_metabolites_common))

Proteomics heatmap: Purine and Pyrimidine in nucleotide = opposite trends? AA metabolism: Proteosome and Ribosomal genes Other pathways might have too many subpathways for pattern to emerge TCA: Not Known pathway genes

Sample correlation

map(omics, ~.x$Sample_Corr[[3]])
$Transcriptome


$Proteome


$Metabolome

Feature correlation

map(omics, ~.x$Feature_Corr[[3]])
$Transcriptome


$Proteome


$Metabolome

# Essentiality correlation

### Transcriptomic Comparison ###
Essential_vs_Not_Genes <- omics$Transcriptome$Feature_Corr$Features_quart %>% mutate(Essentiality = if_else(Feature %in% 
    Essential_genes, "Essential", "Not_Essential"))

ggplot(Essential_vs_Not_Genes, aes(values, after_stat(density), colour = Essentiality)) + 
    geom_freqpoly(binwidth = 0.1) + ggtitle(paste("mRNA essentiality Correlation"))

Matrix correlation

map(omics, ~.x$Matrix_Corr[[3]])
$Transcriptome


$Proteome


$Metabolome

Sample Lineage

map(omics, ~.x$Sample_Corr[[4]])
$Transcriptome


$Proteome


$Metabolome

Feature Gene expression Corr

map(omics, ~.x$Feature_Corr[[4]])
$Transcriptome


$Proteome


$Metabolome

1 is the bottom quartile and 4 the top quartile

Heatmaps

map(omics, ~.x$Heatmap)
$Transcriptome


$Proteome


$Metabolome

Here we see that the proteins might actually be more biologically relevant than the mRNA and the 2 datasets are comparable. The CCLE proteome has more NAs but this is not shown here due to ‘pairwise’ correlation. But when the data is present, the comparison is good

Transcript-Proteom Heatmap

Genes_Matrix <- omics$Transcriptome$Heatmap$Dataset1_matri 
Proteins_Matrix <- omics$Proteome$Heatmap$Dataset1_matri 

Gene_mapping <- HUMAN_9606_idmapping %>%
  subset((Type == "Gene_Name") & (Uniprot %in% rownames(Proteins_Matrix)),-2) %>%
  subset(!(duplicated(Uniprot) |  duplicated(ID))) %>%
  .[match(rownames(Proteins_Matrix),.$Uniprot),] %>% .$ID

Proteins_Matrix <- Proteins_Matrix %>% 
  magrittr::set_rownames(Gene_mapping) %>% 
  magrittr::set_colnames(Gene_mapping)

common_genes <- intersect(rownames(Genes_Matrix), rownames(Proteins_Matrix))
Master_pathways <- Master_pathways[!duplicated(Master_pathways$ID),]
Master_pathways_ordered <- Master_pathways[Master_pathways$ID %in% common_genes,] %>% .[order(.$Pathway),]# %>%
      #left_join(humankegg_df, by = c("ID" = "Uniprot")) %>% subset(!duplicated(ID))
      #left_join(Gene_pathways[,c("Uniprot","SubPathway")], by = c("ID" = "Uniprot")) %>% subset(!duplicated(ID))
Master_pathways_ordered[is.na(Master_pathways_ordered)] <- "Not Known"
common_genes <- common_genes[match(Master_pathways_ordered$ID, common_genes)]

Genes_matrix <- Genes_Matrix %>% 
  .[rownames(.) %in% common_genes,colnames(.) %in% common_genes] %>%
  .[match(common_genes,rownames(.)),match(common_genes,colnames(.))]

Proteins_Matrix <- Proteins_Matrix %>% 
  .[rownames(.) %in% common_genes,colnames(.) %in% common_genes] %>%
  .[match(common_genes,rownames(.)),match(common_genes,colnames(.))]



Combined_matrix <- as.data.frame(Genes_matrix+t(Proteins_Matrix)) %>% as.matrix()
      
    
set.seed(1587) # to set random generator seed
cl <- colors(distinct = TRUE)
  
Master_Pathway_colours <- sample(cl, length(unique(Master_pathways_ordered$Pathway))) %>% setNames(unique(Master_pathways_ordered$Pathway))
#Sub_Pathway_colours <- sample(cl, length(unique(Master_pathways_ordered$SubPathway))) %>% setNames(unique(Master_pathways_ordered$SubPathway))
Combined_matrix[Combined_matrix ==2] <- 1
ha = HeatmapAnnotation(Master_path = Master_pathways_ordered$Pathway,
                           #Sub_path = Master_pathways_ordered$SubPathway,
                           col = list(Master_path = Master_Pathway_colours))#,
                                      #Sub_path = Sub_Pathway_colours))
side = rowAnnotation(Master_path = Master_pathways_ordered$Pathway,
                           #Sub_path = Master_pathways_ordered$SubPathway,
                           col = list(Master_path = Master_Pathway_colours))#,
                                      #Sub_path = Sub_Pathway_colours))
    
hmap <- Heatmap(Combined_matrix, #name = Comparison_type, 
            top_annotation = ha ,
            left_annotation = side,
            cluster_rows = F, cluster_columns = F, show_column_names  = F, show_row_names = F, 
            row_title = paste0("top = ", "Transcriptome"), 
            column_title = paste0("bottom = ", "Proteome"),
            column_title_side = "bottom")
hmap